In [1]:
import os
import pandas as pd
import numpy as np
import random
import holidays
import plotly.express as px
import matplotlib.pyplot as plt 
import plotly.graph_objects as go

from tqdm import tqdm
from scipy.signal import find_peaks, detrend

import plotly.express as px 

from IPython.display import display

from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor 

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, root_mean_squared_error, make_scorer
from sklearn.metrics import root_mean_squared_error , mean_squared_error 
from sklearn.metrics import mean_absolute_percentage_error

from sklearn.linear_model import LinearRegression
In [2]:
def describe_time_series(df , col = None , hist_margin = 'rug' , hist_on = False ) :
    ### takes a dataframe with a datetime index
    ### returns
    ##  summary statistics 
    ## plots 

    col_ts =  col if col != None else df.columns[0] 
    ts = df.loc[:, [col_ts] ]

    ## CHECK TIMESTAMPS
    continous_index = pd.date_range ( ts.index.min() , ts.index.max() ,  freq = ts.index.freq)
    continous_dates_df = pd.DataFrame(index = continous_index ,
                                      data = ({'cont_dates': continous_index }) )
    
    measures_per_timestamp = continous_dates_df.merge(ts , left_index= True , right_index= True , how = 'left').groupby('cont_dates').count()
    measures_per_timestamp['problem'] = measures_per_timestamp[measures_per_timestamp != 1 ] 

    print(' DUPLICATE OR MISSING TIMESTAMPS' , measures_per_timestamp[measures_per_timestamp.problem == 1 ])   

    ## CHECK NULL VALUES 
    print ('MISSING VALUES in ', ts.isnull().sum()[0], ',', (ts.isnull().sum()/ts.shape[0])[0] , ' %  TIMESTEPS' ) 
    print( 'NUMBER OF TIMESTEPS' , ts.shape[0])

    ## PLOT TIMESERIES 
    line_plot  = px.line( data_frame= ts  , 
          x = ts.index , 
          y = str(col_ts) , 
          title = 'LINE PLOT ' + str(col_ts) )

    ## DISTRIBUTION PLOT 

    histo = px.histogram( ts, x= str(col_ts),  marginal=hist_margin ) # can be `box`, `violin`
    ## CHECK 

    yearly_summary = ts.resample('YS').agg( ['sum' ,'mean','min' , 'max' , 'std'])
    
    line_plot.show()
    if hist_on == True: 
          histo.show()

    print(yearly_summary)



    return 
    
In [3]:
def extract_time_features( df_in , holidays_on = True , feature_format = 'linear' ): 
    ## takes a dataframe with the index as a timeseries 
    ## gets back time related features in the dataframe 
    ## parameters : feature format : linear or sinusoidal 


    df = df_in.copy(deep=True) 

    df['is_weekend'] = (df.index.weekday > 4).astype(int)

    if holidays_on == True:
        country_holidays = holidays.Belgium()
        df['dates'] = df.index 
        df['is_holiday'] = df.dates.apply( lambda x: x in country_holidays).astype(int)
        df.drop( columns = 'dates' , inplace = True )

        

    if feature_format == 'linear': 
        df['hour'] =    df.index.hour
        df['month'] =   df.index.month
        df['weekday'] = df.index.weekday

    if feature_format == 'sinusoidal':
        df['hour_cos'] = np.cos( ( df.index.hour  ) / 23 * 2 * np.pi )
        df['hour_sin'] = np.sin( ( df.index.hour  ) / 23 * 2 * np.pi )

        df['month_cos'] = np.cos( df.index.month / 11 * 2 * np.pi )
        df['month_sin'] = np.sin( df.index.month / 11 * 2 * np.pi )

        
        df['weekday_cos'] = np.cos( (df.index.weekday ) / 6 * 2 * np.pi )
        df['weekday_sin'] = np.sin( (df.index.weekday ) / 6 * 2 * np.pi )

    return df 
In [4]:
def create_lags(df_in , col = None , n_lags = [ 96 , 96*2 ] , na_strategy = 'bfill' , verbose = False  ):

    ## Takes a dataframe and a column and a list of lags to be performed 
    ## strategy to handle na_shall_also_be_specified 

    df = df_in.copy(deep = True )
 
    col_ts =  col if col != None else df.columns[0] 

    for l in n_lags:

        df['lag_'+str(l)+'_'+col_ts] = df[col_ts].shift(l)
        
    df.fillna( method = na_strategy  , inplace = True )
    if verbose == True : 
        print(df.columns)

    return df 
    
In [5]:
def plot_all_columns( df_input , graph_title  ) :
    fig = go.Figure()

    for c in df_input.columns:
        fig.add_trace( go.Scatter( x = df_input.index , y = df_input[c] , name = str(c) ))

    fig.update_layout(title = graph_title)
    return fig.show()
In [6]:
##df_index = pd.date_range('2019-01-01' ,'2022-01-01' , freq = '15min' )  
##df_values = 10*( np.sin( df_index.hour ))

##df = pd.DataFrame( index = df_index , data = {'measure' : 1000*( np.sin( df_index.hour )),
##                                            'temperature' : 4*( np.cos( df_index.hour )) })           
In [7]:
PAR= { 'TARGET_VALUE' : 'load' , 
       'TRAIN_START' : '2019-01-01' , 
       'TRAIN_END'   :  '2023-01-01' , 
       'TEST_END' :     '2023-12-31'}

Formulation du Problème¶

Construire un modele pour prevoir tous les valeurs horaires de consommations avec 24 heures d'avance¶

Le dataset¶

Entsoe: données de consommation des réseaux europeens, disponibles d'une maniere publique. Use case: Charge electrique Suisse

https://transparency.entsoe.eu/load-domain/r2/totalLoadR2/show

In [8]:
# Input 
load = pd.read_csv('..\\data\\curated_data\\load_clean.csv', parse_dates = [2])

df  = load.copy(deep = True)
df.set_index('dt' , inplace = True) 
df.drop( columns= 'load_forecast',inplace = True)

df.head()
Out[8]:
load
dt
2019-01-01 00:00:00 7037.0
2019-01-01 01:00:00 7096.0
2019-01-01 02:00:00 7244.0
2019-01-01 03:00:00 7443.0
2019-01-01 04:00:00 7353.0
In [9]:
describe_time_series(df, col = 'load')
 DUPLICATE OR MISSING TIMESTAMPS Empty DataFrame
Columns: [load, problem]
Index: []
MISSING VALUES in  0 , 0.0  %  TIMESTEPS
NUMBER OF TIMESTEPS 50393
C:\Users\Patrizio\AppData\Local\Temp\ipykernel_20132\3123582690.py:21: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  print ('MISSING VALUES in ', ts.isnull().sum()[0], ',', (ts.isnull().sum()/ts.shape[0])[0] , ' %  TIMESTEPS' )
                  load                                           
                   sum         mean     min      max          std
dt                                                               
2019-01-01  63399414.0  7238.202306  4333.0   9924.0   981.167076
2020-01-01  62411627.0  7105.957759  4704.0   9874.0   949.418974
2021-01-01  63519856.0  7251.952963  4377.0  10242.0  1073.346090
2022-01-01  64614057.0  7376.876013  4510.0  10189.0   981.156288
2023-01-01  61054726.0  6971.309203  3747.0  10013.0   972.669413
2024-01-01  44015287.0  6694.340228  3746.0   9953.0  1128.948133
In [10]:
df.head()
Out[10]:
load
dt
2019-01-01 00:00:00 7037.0
2019-01-01 01:00:00 7096.0
2019-01-01 02:00:00 7244.0
2019-01-01 03:00:00 7443.0
2019-01-01 04:00:00 7353.0

Facteurs de dependence¶

In [11]:
prep_df  =( df.pipe(  extract_time_features , feature_format = 'sinusoidal' )
              .pipe(  create_lags , col = PAR['TARGET_VALUE'], n_lags = [24*7, 24*1,24*2] ))
 
C:\Users\Patrizio\AppData\Local\Temp\ipykernel_20132\2393523276.py:14: FutureWarning:

DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.

In [12]:
prep_df.tail()
Out[12]:
load is_weekend is_holiday hour_cos hour_sin month_cos month_sin weekday_cos weekday_sin lag_168_load lag_24_load lag_48_load
dt
2024-09-30 19:00:00 6736.0 0 0 0.460065 -8.878852e-01 0.415415 -0.909632 1.0 0.0 6971.0 5820.0 6268.0
2024-09-30 20:00:00 6876.0 0 0 0.682553 -7.308360e-01 0.415415 -0.909632 1.0 0.0 6898.0 6138.0 6158.0
2024-09-30 21:00:00 6549.0 0 0 0.854419 -5.195840e-01 0.415415 -0.909632 1.0 0.0 6543.0 5926.0 5896.0
2024-09-30 22:00:00 6672.0 0 0 0.962917 -2.697968e-01 0.415415 -0.909632 1.0 0.0 6084.0 5796.0 5582.0
2024-09-30 23:00:00 6287.0 0 0 1.000000 -2.449294e-16 0.415415 -0.909632 1.0 0.0 5793.0 6172.0 5481.0
In [13]:
plot_all_columns(prep_df[prep_df.index > '2020-04-01 00:00:00'].head(96 * 7)  , graph_title= 'Features pour une semaine ')
In [14]:
px.bar( prep_df.corr()['load'].sort_values() , title = 'Correlations avec Consommation MW' )
In [15]:
#df['temperature'] = 12 + 10 * np.cos( df.index.month / 12 * 2 * np.pi ) + 3 *  np.sin( df.index.hour / 24 * 2 * np.pi ) + 3*random.uniform(-1, 1)
#describe_time_series( df , col = 'temperature')

Separer dataset train et test¶

In [16]:
train  = prep_df[ (prep_df.index >= PAR['TRAIN_START']) &  (prep_df.index <= PAR['TRAIN_END'])]
test   = prep_df[ (df.index > PAR['TRAIN_END'] )  & ( prep_df.index < PAR['TEST_END'])]

y_train  = train[PAR['TARGET_VALUE']]
X_train  = train.drop(columns = PAR['TARGET_VALUE'] ) 

y_test  = test[PAR['TARGET_VALUE']]
X_test  = test.drop(columns = PAR['TARGET_VALUE'] ) 

Attention¶

Le referentiel de temps choisi ici c'est le temps "futur"--> le moment dans le futur ou le load est consommé

L'autre approche possible c'est le referentiel du "moment de la prevision " dans notre cas 24 heures avant le moment de la consommation. Pour simplicitè et facilitè de visualisation on choisi le referentiel "futur"

In [17]:
X_train.tail()
Out[17]:
is_weekend is_holiday hour_cos hour_sin month_cos month_sin weekday_cos weekday_sin lag_168_load lag_24_load lag_48_load
dt
2022-12-31 20:00:00 1 0 0.682553 -7.308360e-01 0.841254 0.540641 0.5 -8.660254e-01 6750.0 7125.0 7124.0
2022-12-31 21:00:00 1 0 0.854419 -5.195840e-01 0.841254 0.540641 0.5 -8.660254e-01 6598.0 6997.0 6997.0
2022-12-31 22:00:00 1 0 0.962917 -2.697968e-01 0.841254 0.540641 0.5 -8.660254e-01 6547.0 7779.0 6969.0
2022-12-31 23:00:00 1 0 1.000000 -2.449294e-16 0.841254 0.540641 0.5 -8.660254e-01 6911.0 7872.0 6667.0
2023-01-01 00:00:00 1 1 1.000000 0.000000e+00 0.841254 0.540641 1.0 -2.449294e-16 6946.0 7896.0 6904.0
In [18]:
y_train.tail()
Out[18]:
dt
2022-12-31 20:00:00    6735.0
2022-12-31 21:00:00    6988.0
2022-12-31 22:00:00    7656.0
2022-12-31 23:00:00    7691.0
2023-01-01 00:00:00    7667.0
Name: load, dtype: float64
In [19]:
fig = go.Figure() 
fig.add_trace( go.Scatter( x = y_train.index , y = y_train.values , name = 'Train'))
fig.add_trace( go.Scatter( x = y_test.index ,  y = y_test.values ,  name = 'Test'))
fig.update_layout( title = 'TRAIN - TEST')
fig.show()

¶

Avant des modeles complexes: une baseline solide !¶

On note une periodicité horaire et une difference jour et jour un modele simple mais efficace pourrait etre que la consommation de demain à l'heure X est la consommation il y a 7 jour à la meme heure

In [20]:
y_naive= prep_df[PAR['TARGET_VALUE']].shift( 24 * 7 )
y_naive_test = y_naive[y_naive.index.isin(y_test.index)]
y_naive_train= y_naive[y_naive.index.isin(y_train.index)]
In [21]:
fig = go.Figure() 

fig.add_trace( go.Scatter( x = y_test.index ,  y = y_test ,  name = 'Test'))
fig.add_trace( go.Scatter( x = y_naive_test.index ,  y = y_naive_test ,  name = 'NAIVE 7 jours'))

fig.update_layout(title = 'NAIVE MODEL VS TEST SET')
fig.show()
In [22]:
print( np.round( 100* np.mean( np.abs( y_naive_test - y_test ) / y_test ) ,3 ) , 'MAPE NAIVE 7 jours' )  
7.88 MAPE NAIVE 7 jours
In [23]:
fig_hist = go.Figure() 
fig_hist = px.histogram(x =  y_naive_test - y_test  , title = 'Distribution des erreurs' )
fig_hist.add_trace( go.Scatter( x = [0,0] , y = [0, 1000]))

Augmenter la complexité Graduellement¶

  1. Modele de regression lineaire
  2. Modele Random Forest

D'autres choix sont possibles:

  • Travailler sur les parametres des modeles
  • Travailler sur la forme du probleme en time serie
  • Travailler avec des modeles plus complexes

Documentation

  • https://scikit-learn.org/1.5/modules/generated/sklearn.pipeline.Pipeline.html
  • https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
  • https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor
In [24]:
linear_regression_pipeline = Pipeline(
[
('scaler' ,        StandardScaler() ), 
('LR' ,            LinearRegression())] ) 

linear_regression_pipeline.fit(X_train , y_train )
Out[24]:
Pipeline(steps=[('scaler', StandardScaler()), ('LR', LinearRegression())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scaler', StandardScaler()), ('LR', LinearRegression())])
StandardScaler()
LinearRegression()
In [25]:
random_forest_pipeline = Pipeline(
[
('scaler' ,        StandardScaler() ), 
('RF' ,            RandomForestRegressor(n_estimators=50 ))] ) 

random_forest_pipeline.fit(X_train , y_train )
Out[25]:
Pipeline(steps=[('scaler', StandardScaler()),
                ('RF', RandomForestRegressor(n_estimators=50))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('scaler', StandardScaler()),
                ('RF', RandomForestRegressor(n_estimators=50))])
StandardScaler()
RandomForestRegressor(n_estimators=50)
In [26]:
random_forest_pipeline.predict(X_test)
Out[26]:
array([7138.9 , 7141.7 , 7449.8 , ..., 6906.52, 6912.86, 7003.28])

Travailler les hyperparametres¶

In [27]:
cv_params = { 'RF__max_depth' : [6,10] , 
             'RF__n_estimators' : [100,120 ]  }

## Cross validation 
ts_cv  = TimeSeriesSplit(n_splits = 2 , gap = 24 ) 
In [28]:
## Work multiple combinations of parameters 
grid_search_model = GridSearchCV( 
                            estimator= random_forest_pipeline , 
                            param_grid= cv_params, 
                            cv = ts_cv  , 
                            scoring = make_scorer(mean_squared_error, greater_is_better= False) ,
                            return_train_score = True , 
                            verbose = 4, ) 
In [29]:
grid_search_model.fit(X_train, y_train)
Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV 1/2] END RF__max_depth=6, RF__n_estimators=100;, score=(train=-158075.147, test=-192762.083) total time=   4.0s
[CV 2/2] END RF__max_depth=6, RF__n_estimators=100;, score=(train=-169667.327, test=-208568.034) total time=   7.1s
[CV 1/2] END RF__max_depth=6, RF__n_estimators=120;, score=(train=-157957.416, test=-192454.423) total time=   4.5s
[CV 2/2] END RF__max_depth=6, RF__n_estimators=120;, score=(train=-169434.662, test=-208367.794) total time=   8.9s
[CV 1/2] END RF__max_depth=10, RF__n_estimators=100;, score=(train=-79510.016, test=-180145.303) total time=   6.3s
[CV 2/2] END RF__max_depth=10, RF__n_estimators=100;, score=(train=-102881.844, test=-190039.727) total time=  12.1s
[CV 1/2] END RF__max_depth=10, RF__n_estimators=120;, score=(train=-79227.235, test=-180992.600) total time=   7.4s
[CV 2/2] END RF__max_depth=10, RF__n_estimators=120;, score=(train=-102553.947, test=-190057.997) total time=  14.7s
Out[29]:
GridSearchCV(cv=TimeSeriesSplit(gap=24, max_train_size=None, n_splits=2, test_size=None),
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('RF',
                                        RandomForestRegressor(n_estimators=50))]),
             param_grid={'RF__max_depth': [6, 10],
                         'RF__n_estimators': [100, 120]},
             return_train_score=True,
             scoring=make_scorer(mean_squared_error, greater_is_better=False, response_method='predict'),
             verbose=4)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=TimeSeriesSplit(gap=24, max_train_size=None, n_splits=2, test_size=None),
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('RF',
                                        RandomForestRegressor(n_estimators=50))]),
             param_grid={'RF__max_depth': [6, 10],
                         'RF__n_estimators': [100, 120]},
             return_train_score=True,
             scoring=make_scorer(mean_squared_error, greater_is_better=False, response_method='predict'),
             verbose=4)
Pipeline(steps=[('scaler', StandardScaler()),
                ('RF', RandomForestRegressor(max_depth=10))])
StandardScaler()
RandomForestRegressor(max_depth=10)
In [30]:
grid_search_model.best_params_
Out[30]:
{'RF__max_depth': 10, 'RF__n_estimators': 100}

Compute Predictions¶

In [31]:
results_train = pd.DataFrame(y_train)
results_train['pred_naive_'+PAR['TARGET_VALUE']] = y_naive_train 
results_train['pred_lr_'+PAR['TARGET_VALUE']] = linear_regression_pipeline.predict(X_train)
results_train['pred_rf_'+PAR['TARGET_VALUE'] ] = random_forest_pipeline.predict(X_train)
results_train['pred_rf_cv_'+PAR['TARGET_VALUE'] ] = grid_search_model.predict(X_train)
results_train['set'] = 'train'


results = pd.DataFrame(y_test)
results['pred_naive_'+PAR['TARGET_VALUE']] = y_naive_test 
results['pred_lr_'+PAR['TARGET_VALUE']] = linear_regression_pipeline.predict(X_test)
results['pred_rf_'+PAR['TARGET_VALUE'] ] = random_forest_pipeline.predict(X_test)
results['pred_rf_cv_'+PAR['TARGET_VALUE'] ] = grid_search_model.predict(X_test)

results['set'] = 'test' 

res = pd.concat( [ results_train , results  ], ignore_index = True ).bfill()
In [32]:
plot_all_columns(results , graph_title = 'Predictions')

Metriques¶

In [33]:
res_set = []
models = []
mape_error = []
rmse_error  =[]
mse_error = []

for c in res.columns[1:-1]:
    for s in res.set.unique(): 
        
        r_df = res[res.set == s] 

        res_set.append( s )
        models.append( c )
        mape_error.append( np.round( 100*mean_absolute_percentage_error( r_df[PAR['TARGET_VALUE']].values , r_df[c].values ) ,  3 )) 
        rmse_error.append( np.round( root_mean_squared_error( r_df[PAR['TARGET_VALUE']].values , r_df[c].values ) , 3 ))
        mse_error.append(  np.round( mean_squared_error( r_df[PAR['TARGET_VALUE']].values , r_df[c].values ) , 3 ))

metrics_df = pd.DataFrame( data = {
                             'result_set' : res_set, 
                             'models' : models, 
                             'mape': mape_error , 
                             'rmse':rmse_error,
                             'mse': mse_error })

Metriques¶

In [34]:
metrics_df.pivot(columns= 'result_set' , index= 'models')
Out[34]:
mape rmse mse
result_set test train test train test train
models
pred_lr_load 6.271 4.692 560.632 438.676 314308.709 192436.782
pred_naive_load 7.880 6.257 728.273 595.701 530382.182 354859.629
pred_rf_cv_load 6.140 3.684 555.650 343.848 308747.207 118231.227
pred_rf_load 6.239 1.486 563.757 142.499 317822.080 20306.091
In [35]:
plot_all_columns(load, 'Available prediction')
In [36]:
load_test =load[ load.dt.dt.year == 2023 ]

print( "Available forecast Precision", 
      100*mean_absolute_percentage_error( load_test[PAR['TARGET_VALUE']].values , load_test['load_forecast'].values ))
Available forecast Precision 6.100084377544678
In [ ]: